{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import wfdb\n",
    "import numpy as np\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Segmenting the data from ECG DB into 3 min segments all listed as arrays"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read all the ECG signal files being used (a01-a05 and c01-c05) into one list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ECGFiles = [] # An List with all the ECG Signal filenames\n",
    "for root, dirs, files in os.walk(r'C:\\Python27\\Notebooks\\wfdb\\apneaecg\\Labelled1'):\n",
    "    for file in files:\n",
    "        if file.endswith('.dat'):\n",
    "            parts = file.split('.')  #split the file using . as separator\n",
    "            ECGFiles.append(parts[0])  # append only the first part of the file without the extension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(ECGFiles), ECGFiles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Take each ECG file and convert it into a list of arrays of 3 minute segments\n",
    "(Note each segment overlaps with the next 2 segments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    " cd \"C:/Python27/Notebooks/wfdb/apneaecg/Labelled1/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def ECG_Segment(ECG_Signal_file):\n",
    "    record = wfdb.rdsamp(ECG_Signal_file)\n",
    "    bfunc = np.asarray(record.p_signals)\n",
    "    rawECGFunc = []\n",
    "    for i in range((bfunc.size / 6000)-2):\n",
    "        rawECGFunc.append(bfunc[i*6000:((i*6000)+18000)])\n",
    "    return rawECGFunc\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create list containing the data\n",
    "AllDataSegmented = []\n",
    "for i in ECGFiles:\n",
    "    Seg = ECG_Segment(i)\n",
    "    AllDataSegmented.append(Seg)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# We should have 35 arrays of segmented data\n",
    "len(AllDataSegmented)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "L = AllDataSegmented[0][5][16000:17000,0]\n",
    "plt.plot(L)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now we have all our segments below details how to use them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# As seen above we have 35 elements in the list AllDataSegmented\n",
    "# To extract all the segments for one ECG file use AllDataSegmented[X] eg\n",
    "b = AllDataSegmented[4]\n",
    "\n",
    "# To extract one of the 3 min segments from this ECG signal use b = AllDataSegmented[X][X]\n",
    "d = AllDataSegmented[4][3][12000:18000]\n",
    "\n",
    "# to extract one section eg 60seconds of one of the 3 min segments of 1 of the signals use b = AllDataSegmented[X][X][0:6000]\n",
    "e = AllDataSegmented[1][3][0:18000]\n",
    "\n",
    "# To extract 1 3 min segment as a flat array rather than an array of lists containing 1 value\n",
    "L = AllDataSegmented[1][3][:18000,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.plot(d)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calc RRIntervals for the segments"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Try putting High pass filter in front of the RR Peak detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cd \"C:\\Python27\\Notebooks\\wfdb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "L = AllDataSegmented[4][3][:18000,0]    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import signal\n",
    "import matplotlib.pyplot as plt\n",
    "def sine_generator(fs, sinefreq, duration):\n",
    "    T = duration\n",
    "    nsamples = fs * T\n",
    "    w = 2. * np.pi * sinefreq\n",
    "    t_sine = np.linspace(0, T, nsamples, endpoint=False)\n",
    "    y_sine = np.sin(w * t_sine)\n",
    "    result = pd.DataFrame({ \n",
    "        'data' : y_sine} ,index=t_sine)\n",
    "    return result\n",
    "\n",
    "#High Pass filter\n",
    "\n",
    "def butter_highpass(cutoff, fs, order=15):\n",
    "    nyq = 0.5 * fs\n",
    "    normal_cutoff = cutoff / nyq\n",
    "    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)\n",
    "    return b, a\n",
    "\n",
    "def butter_highpass_filter(data, cutoff, fs, order=1):\n",
    "    b, a = butter_highpass(cutoff, fs, order=order)\n",
    "    y = signal.filtfilt(b, a, data)\n",
    "    return y\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Demonstrating the high pass filter on a signal with 5Hz and 1Hz elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "fps = 100\n",
    "sine_fq = 5 #Hz\n",
    "duration = 5 #seconds\n",
    "sine_5Hz = sine_generator(fps,sine_fq,duration)\n",
    "sine_fq = 1 #Hz\n",
    "duration = 5 #seconds\n",
    "sine_1Hz = sine_generator(fps,sine_fq,duration)\n",
    "\n",
    "sine_fq = 40 #Hz\n",
    "duration = 5 #seconds\n",
    "sine_40Hz = sine_generator(fps,sine_fq,duration)\n",
    "\n",
    "sine = sine_5Hz + sine_1Hz + sine_40Hz\n",
    "\n",
    "filtered_sine = butter_highpass_filter(sine.data,20,fps)\n",
    "\n",
    "plt.figure(figsize=(20,10))\n",
    "plt.subplot(211)\n",
    "plt.plot(range(len(sine)),sine)\n",
    "plt.title('generated signal')\n",
    "plt.subplot(212)\n",
    "plt.plot(range(len(filtered_sine)),filtered_sine)\n",
    "plt.title('filtered signal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filtering using high pass filter to allow any frequency above 0.5 \n",
    "# should remove respiratory artifacts (0.12Hz to 0.50HZ) ie 8-30 breaths per minute\n",
    "#### BUT found that 20Hz cut off gave better suppression of non-R peak aspects of the ECG\n",
    "\n",
    "# Trying data filtering\n",
    "t_ECG = np.linspace(0,10, 1000, endpoint=False)\n",
    "ECG = pd.DataFrame({'data' : L}, index=t_ECG)\n",
    "\n",
    "filtered_ecg = butter_highpass_filter(ECG.data,20,fps)   # 20 here is the 20Hz cutoff\n",
    "\n",
    "plt.figure(figsize=(20,10))\n",
    "plt.subplot(211)\n",
    "plt.plot(range(len(ECG)),ECG)\n",
    "plt.title('generated signal')\n",
    "plt.subplot(212)\n",
    "plt.plot(range(len(filtered_ecg)),filtered_ecg)\n",
    "plt.title('filtered signal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Plot an example of one of the data segments and its filtered version\n",
    "\n",
    "plt.figure(figsize=(20,10))\n",
    "plt.subplot(211)\n",
    "plt.plot(range(len(AllDataSegmented[16][3])),AllDataSegmented[16][3])\n",
    "plt.title('generated signal')\n",
    "plt.subplot(212)\n",
    "plt.plot(range(len(AllDataFiltered[16][3])),AllDataFiltered[16][3])\n",
    "plt.title('filtered signal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lets only take the mid sections of the data - cleaner data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "MidDataSegments = []\n",
    "for i in AllDataSegmented:\n",
    "    MidSeg = i[75:(len(i))-75]\n",
    "    MidDataSegments.append(MidSeg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#L = MidDataSegments[0][1][10000:12000,0]\n",
    "L = MidDataSegments[0][5]\n",
    "plt.plot(L)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lets run the filter against all our data segments\n",
    "(Remember that each ECG signal is for a different total time length so each MidDataSegmented will have a different length len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "AllDataFiltered = []\n",
    "t_ECG = np.linspace(0,180, 18000, endpoint=False)    # Setup the index for the dataframe used\n",
    "for i in MidDataSegments:\n",
    "    SegmentsFiltered = []\n",
    "    for j in i:\n",
    "        TempSig = j[:18000,0]\n",
    "        ECG = pd.DataFrame({'data' : TempSig}, index=t_ECG)\n",
    "        filtered_ecg = butter_highpass_filter(ECG.data,20,fps)\n",
    "        SegmentsFiltered.append(filtered_ecg)\n",
    "    AllDataFiltered.append(SegmentsFiltered)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Might be a good time to dump the data into a dataframe and then into a pickle file which can be opened from this stage then without rerunning all above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Create pkl file represenation of filtered data (Note it's 2.2GB in size)\n",
    "\n",
    "import joblib\n",
    "iterate =0\n",
    "for i in AllDataFiltered:\n",
    "    filename = 'AllDataFiltered'\n",
    "    filenumber = str(iterate)\n",
    "    fileextension = '.pkl'\n",
    "    name = filename+filenumber+fileextension\n",
    "    joblib.dump(AllDataFiltered[iterate], name, compress=3)  \n",
    "    #print(name)\n",
    "    iterate = iterate + 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "AllDataFiltered = joblib.load('AllDataFiltered.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load pickle file fitlered data\n",
    "\n",
    "import joblib\n",
    "iterate = 0\n",
    "AllDataFilteredTest = []\n",
    "while iterate < 35:\n",
    "    filename = 'AllDataFiltered'\n",
    "    filenumber = str(iterate)\n",
    "    fileextension = '.pkl'\n",
    "    name = filename+filenumber+fileextension\n",
    "    AllDataFilteredTest.append(joblib.load(name))\n",
    "    iterate = iterate + 1\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Heart rate calculator using threshold method\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def RRInterval(SegmentIn):\n",
    "    overThres = [] \n",
    "    mean = SegmentIn.mean()\n",
    "    std = SegmentIn.std()\n",
    "    Thres = mean + (2 * std) # use criteria that peaks are at least 2 X std deviation above mean\n",
    "    print Thres\n",
    "    for i in SegmentIn:\n",
    "        overThres.append(1*(i>Thres))    # Find those over the threshold \n",
    "    overThresnp = np.array(overThres)\n",
    "    changes = overThresnp[2:18000]-overThresnp[1:17999]\n",
    "    ndx = []\n",
    "    for i in range(17998):\n",
    "        if changes[i] > 0:\n",
    "            ndx.append(i+2) \n",
    "    ndxlen=len(ndx)\n",
    "    npndx =np.array(ndx)\n",
    "    RRIntervals = npndx[2:ndxlen]-npndx[1:ndxlen-1]\n",
    "    lenRR = len(RRIntervals)\n",
    "    HRs = []\n",
    "    for i in range(lenRR):\n",
    "        HRs.append (6000/RRIntervals[i])\n",
    "    return RRIntervals, HRs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correction function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def RRCorrection (RRIntervalValues):\n",
    "    Anp = np.array(RRIntervalValues)\n",
    "    Amean = Anp.mean()\n",
    "    A = RRIntervalValues\n",
    "    for c in range(len(A)):\n",
    "       if A[c] > Amean+(Amean/3):\n",
    "        print A[c]\n",
    "        A[c] = Amean\n",
    "        print A[c]\n",
    "       if A[c] < Amean-(Amean/3):\n",
    "        print A[c]\n",
    "        A[c] = Amean\n",
    "        print A[c]\n",
    "    return A"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use this function to run the RRInterval and Heart Rate calulator and the correction functions on the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def GetHeartRates (FilteredSig):\n",
    "    AllHeartRates = []\n",
    "    AllRRs = []\n",
    "    SegmentRRs = []\n",
    "    SegmentHRs = []\n",
    "    iterate2 = 0\n",
    "    for j in FilteredSig:\n",
    "        print(\"...\")\n",
    "        print(iterate2)\n",
    "        iterate2 = iterate2 + 1\n",
    "        TempSig = j[:18000]\n",
    "        TempRR, TempHR = RRInterval(TempSig)\n",
    "        TempCorrectedHR = RRCorrection(TempHR)\n",
    "        TempCorrectedRRInt = RRCorrection(TempRR)\n",
    "        SegmentHRs.append(TempCorrectedHR)\n",
    "        SegmentRRs.append(TempCorrectedRRInt)\n",
    "    AllHeartRates.append(SegmentHRs)\n",
    "    AllRRs.append(SegmentRRs)\n",
    "    return AllRRs, AllHeartRates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use above funtion to get HeartRates for signals to be used for trainign the data 5 signals from Apnea patients a01-a05 and 5 from control set c01-05"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# These take some time to run\n",
    "RRa01, HRa01 = GetHeartRates (AllDataFiltered[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRa02, HRa02 = GetHeartRates (AllDataFiltered[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRa03, HRa03 = GetHeartRates (AllDataFiltered[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRa04, HRa04 = GetHeartRates (AllDataFiltered[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRa05, HRa05 = GetHeartRates (AllDataFiltered[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRc01, HRc01 = GetHeartRates (AllDataFiltered[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRc02, HRc02 = GetHeartRates (AllDataFiltered[6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRc03, HRc03 = GetHeartRates (AllDataFiltered[7])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRc04, HRc04 = GetHeartRates (AllDataFiltered[8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RRc05, HRc05 = GetHeartRates (AllDataFiltered[9])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Lets dump this stage of data - do for each set of HR data\n",
    "import joblib\n",
    "joblib.dump(HRc01[0], 'HRc01.pkl', compress=3)  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Statistical calculations RRIntervals NN50 values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "'''Calc NN50 measure (variant 1), defined as the number of pairs of adjacent RR- intervals \n",
    "where the first RR- interval exceeds the second RR- interval by more than 50 ms.\n",
    "Note my calc is not strictly NN50 because the resolution doesn't show if it's 51ms only if it's 60ms greater\n",
    "'''\n",
    "\n",
    "def NN50 (RRintervals):\n",
    "    NN50_count = 0\n",
    "    temp = 999\n",
    "    for i in RRintervals:\n",
    "        if (temp - i) > 5:\n",
    "            NN50_count = NN50_count + 1\n",
    "        temp = i\n",
    "    return NN50_count\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "'''The NN50 measure (variant 2), defined as the number of pairs of adjacent RR-intervals where the \n",
    "second RR- interval exceeds the first RR interval by more than 50 ms\n",
    "'''\n",
    "def NN50_count2 (RRint):\n",
    "    NN50_count2 = 0\n",
    "    temp = 999\n",
    "    for i in RRint:\n",
    "        if (i - temp) > 5:\n",
    "            NN50_count2 = NN50_count2 + 1\n",
    "        temp = i\n",
    "    return NN50_count2\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### We want to add stats to each stats array by giving this function our arrays of RR-Intervals and HRs and the arrays for the stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def HRstats_new(RRArray, HRArray,HRm,HRs,NN501,NN502):\n",
    "    for i in HRArray:\n",
    "        #print (i)\n",
    "        HRm.append(np.mean(i))\n",
    "        HRs.append(np.std(i))\n",
    "    for i in RRArray:\n",
    "        NN501.append(NN50(i))\n",
    "        NN502.append(NN50_count2(i))\n",
    "    return HRm,HRs,NN501,NN502"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create datafeatures and classifys buckets for training and validating the classifier then add the HR stats and the annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "datafeatures = []        #Hashing this out to allow appending the next lot of data\n",
    "classifys = []           #Hashing this out to allow appending the next lot of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def prepclassifiercontainers(datafeatures, classifys):\n",
    "    iterate = 0\n",
    "    if len(MyAnn) < len(HRmean):\n",
    "        for i in MyAnn:\n",
    "            data_features = []\n",
    "            data_features.append(HRmean[iterate])\n",
    "            data_features.append(HRstd[iterate])\n",
    "            data_features.append(NN50Result[iterate])\n",
    "            data_features.append(NN502Result[iterate])\n",
    "            classifys.append(MyAnn[iterate])\n",
    "            iterate=iterate+1\n",
    "            datafeatures.append(data_features)\n",
    "    else:\n",
    "        for i in HRmean:                 \n",
    "            data_features = []\n",
    "            data_features.append(HRmean[iterate])\n",
    "            data_features.append(HRstd[iterate])\n",
    "            data_features.append(NN50Result[iterate])\n",
    "            data_features.append(NN502Result[iterate])\n",
    "            classifys.append(MyAnn[iterate])\n",
    "            iterate=iterate+1\n",
    "            datafeatures.append(data_features)\n",
    "    return datafeatures, classifys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Function below to pull in the annotations for the "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Annot(sigfile):\n",
    "    annotation = wfdb.rdann(sigfile, 'apn')\n",
    "    AnnSym = annotation.symbol\n",
    "    MyAnn = [] # initialise the container\n",
    "    for i in range (len(AnnSym)-149):             # Using only the mid section of the signal so taking 75 minutes off each end \n",
    "        if AnnSym[i+74] == 'A' and AnnSym[i+75] == 'A' and AnnSym[i+76] == 'A': \n",
    "            Ann = 'A'\n",
    "        else :\n",
    "            Ann = 'N'\n",
    "        MyAnn.append(Ann)\n",
    "    return MyAnn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run the step below agaisnt each signal file to populate the datafeatures and classifys (labels) arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "HRmean = []\n",
    "HRstd = []\n",
    "NN50Result = []\n",
    "NN502Result = []\n",
    "HRstats_new(RRc05[0], HRc05[0], HRmean, HRstd, NN50Result, NN502Result) \n",
    "        ## Change this  to the signal file you're populating featuress from\n",
    "MyAnn = Annot('c05')   ## Change this each time to the signal file you're populating labels from\n",
    "datafeatures, classifys = prepclassifiercontainers(datafeatures, classifys)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    " len(datafeatures), len(classifys), datafeatures,  classifys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shuffle the data before using it for training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "iterate = 0\n",
    "for i in datafeatures:\n",
    "    i.append(classifys[iterate])\n",
    "    iterate = iterate + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "random.shuffle(datafeatures)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifys = []\n",
    "iterate = 0\n",
    "for i in datafeatures:\n",
    "    classifys.append(i[4])\n",
    "    i.pop(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'classifys_shuffled.pkl'\n",
    "pickle.dump(classifys, open(filename, 'wb'))\n",
    "filename = 'datafeatures_shuffled.pkl'\n",
    "pickle.dump(datafeatures, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'classifys_shuffled.pkl'\n",
    "classifys = pickle.load(open(filename, 'rb'))\n",
    "filename = 'datafeatures_shuffled.pkl'\n",
    "datafeatures = pickle.load(open(filename, 'rb'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lets try classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Do the initial classifer training and testing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Split the data into a training and test set 90:10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dflen = len(datafeatures)\n",
    "split = (9*dflen/10)            # Split is 9/10 of the total dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###Running the SVM (support vector model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Running the SVM (support vector model)\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "svm = SVC(C=1.0, gamma='auto', kernel='rbf')\n",
    "svm.fit(datafeatures[:split], classifys[:split])        # We're training on just the data features up to the split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Make an array of predictions on the validation set\n",
    "predictions = svm.predict(datafeatures[split+1:dflen-1])        # getting predictions based on the held back validation\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(svm.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the svm prediction engine to file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cd \"C:\\Python27\\Notebooks\\wfdb\\apneaecg\\Labelled\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'finalsvm.pkl'\n",
    "pickle.dump(svm, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'datafeatures.pkl'\n",
    "pickle.dump(datafeatures, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'classifys.pkl'\n",
    "pickle.dump(classifys, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'finalsvm.pkl'\n",
    "svm2 = pickle.load(open(filename, 'rb'))\n",
    "#print(svm2.score(validation_data_features, validation_classifys))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Try different classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'classifys.pkl'\n",
    "classifys = pickle.load(open(filename, 'rb'))\n",
    "filename = 'datafeatures.pkl'\n",
    "datafeatures = pickle.load(open(filename, 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cd 'C:\\Python27\\Notebooks\\wfdb\\apneaecg\\Labelled1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dflen = len(datafeatures)\n",
    "split = (9*dflen/10)     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "svm = SVC(C=1.0, gamma=0.1, kernel='rbf')\n",
    "svm.fit(datafeatures[:split], classifys[:split])        # We're training on just the dataset up to the split\n",
    "\n",
    "# Make an array of predictions on the validation set\n",
    "predictions = svm.predict(datafeatures[split+1:dflen-1])        # getting predictions based on the held back validation set\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(svm.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.svm import SVC\n",
    "from sklearn import svm\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import linear_model, metrics\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifier = RandomForestClassifier()\n",
    "\n",
    "classifier.fit(datafeatures[:split], classifys[:split])\n",
    "\n",
    "# Make an array of predictions on the validation set\n",
    "predictions = classifier.predict(datafeatures[split+1:dflen-1])\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(classifier.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = 'RF_Class.pkl'\n",
    "pickle.dump(classifier, open(filename, 'wb'), protocol=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SVM SVC(C=10, gamma=0.001, kernel='rbf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifier = SVC(C=10, gamma=0.001, kernel='rbf')\n",
    "\n",
    "classifier.fit(datafeatures[:split], classifys[:split])\n",
    "\n",
    "# Make an array of predictions on the validation set\n",
    "predictions = classifier.predict(datafeatures[split+1:dflen-1])\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(classifier.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## K-Nearest Neighbour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifier = KNeighborsClassifier()\n",
    "\n",
    "classifier.fit(datafeatures[:split], classifys[:split])\n",
    "\n",
    "# Make an array of predictions on the validation set\n",
    "predictions = classifier.predict(datafeatures[split+1:dflen-1])\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(classifier.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "pickle.dump(classifier, open('KNear.pkl', 'wb'), protocol=0)  \n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Decision Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "classifier = DecisionTreeClassifier()\n",
    "\n",
    "classifier.fit(datafeatures[:split], classifys[:split])\n",
    "\n",
    "# Make an array of predictions on the validation set\n",
    "predictions = classifier.predict(datafeatures[split+1:dflen-1])\n",
    "\n",
    "# Output the percentage accuracy, total predicted correctly and the confusion matrix for each model\n",
    "print(classifier.score(datafeatures[split+1:dflen-1], classifys[split+1:dflen-1]))\n",
    "print(confusion_matrix(predictions, classifys[split+1:dflen-1]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
